# For data prep
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd
import sqlite3
from sqlalchemy import create_engine
# For PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from scipy.spatial.distance import euclidean
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import math
import bisect
# For Clustering
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.cluster.hierarchy import fcluster
# For plotting
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotnine import *
# For writeup
from IPython.display import HTML
from IPython.core.display import HTML as Center
import warnings
# Silence every warning category for the rest of the session so library
# warnings do not clutter the rendered write-up.
warnings.filterwarnings('ignore')
# Build an HTML/JavaScript snippet for the notebook front end: on document
# ready it hides every `div.input` element, and the form button calls
# `code_toggle()` to show/hide them again. The script uses the jQuery `$`
# function, which must be provided by the page.
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
def _build_proj_theme(text_size, strip_text_size):
    """Build the plotnine theme shared by the project's charts.

    Parameters
    ----------
    text_size : int
        Size passed to the base ``text`` element.
    strip_text_size : int
        Size passed to the facet strip text (``strip_text_x``).

    Returns
    -------
    theme
        plotnine theme with a white background, dashed horizontal grid
        lines and the project's brown/orange text colors.
    """
    dark_brown = "#722502"
    accent = '#DA4D2E'
    return theme(
        strip_background=element_rect(fill="white"),
        legend_title=element_blank(),
        legend_text=element_text(size=9),
        legend_position=(0.5, 0.93),
        legend_direction="horizontal",
        legend_box_background=element_rect(
            fill="transparent", color="transparent"
        ),
        axis_text_x=element_text(size=10, color=dark_brown),
        axis_text_y=element_text(size=10, color=dark_brown, face="bold"),
        text=element_text(family="sans", color=dark_brown, size=text_size),
        axis_title_x=element_text(
            color=accent, size=12, face="bold", margin={'t': 15}
        ),
        axis_title_y=element_text(
            color=accent, size=12, face="bold", margin={'r': 15}
        ),
        plot_title=element_text(
            hjust=0.5, size=14, face="bold", margin={'b': 33}
        ),
        panel_spacing=0.5,
        panel_background=element_rect(fill="white", color="white"),
        plot_background=element_rect(fill="white"),
        panel_grid_major_x=element_line(colour="white"),
        panel_grid_major_y=element_line(colour="#cfa544", linetype="dashed"),
        panel_grid_minor=element_blank(),
        strip_text_x=element_text(
            size=strip_text_size, hjust=0.5, color=dark_brown, face="bold"
        ),
    )


# Theme with the larger base text (18) and strip text (14).
proj_theme = _build_proj_theme(text_size=18, strip_text_size=14)
# Same theme with smaller base text (14) and strip text (12).
proj_theme2 = _build_proj_theme(text_size=14, strip_text_size=12)
def get_dish_urls():
    """Collect recipe URLs from the chicken, pork, and beef category pages.

    For each meat category the first category page is fetched to read the
    number of pages from the pagination widget, then every page is fetched
    and each ``entry-title-link`` anchor is recorded.

    Returns
    -------
    dish_urls: dict
        Dictionary containing dish names as keys and URLs as values. A dish
        listed more than once keeps the last URL seen.
    """
    dish_urls = {}
    # Only these three main-ingredient categories are scraped
    for meat in ('chicken', 'pork', 'beef'):
        landing_html = requests.get(
            f'https://panlasangpinoy.com/{meat}-recipes/'
        ).text
        landing = BeautifulSoup(landing_html)
        # The list item right after the "omission" item holds the last page
        pager = landing.select_one(
            "div[class='archive-pagination pagination']"
        )
        last_item = pager.select_one("li[class='pagination-omission'] + li")
        page_count = int(last_item.select_one("a").contents[1])
        for page_no in range(1, page_count + 1):
            page_html = requests.get(
                f'https://panlasangpinoy.com/{meat}-recipes/page/{page_no}/'
            ).text
            listing = BeautifulSoup(page_html)
            for link in listing.select("a[class='entry-title-link']"):
                dish_urls[link.contents[0]] = link['href']
    print("Extraction of URLs done. Total number of dishes:", len(dish_urls))
    return dish_urls
def get_ingredients(dish_urls):
    """Get dish information and ingredients for all dishes

    Parameters
    ----------
    dish_urls: dict
        Dictionary containing dish names as keys and URLs as values

    Returns
    -------
    dish_dict: dict
        Dictionary containing dish names as keys and a dictionary of dish
        information and ingredients as their values. Every dish has the
        keys ``course``, ``cuisine``, ``prep_time``, ``cook_time``,
        ``total_time``, ``servings`` and ``calories`` (NaN when the page
        lacks the field), followed by one key per ingredient name whose
        value is the weight computed by ``get_weight``.
    """
    def _scrape(soup, selector, first_content=False):
        """Return the first match's text (or first child), NaN if absent."""
        try:
            node = soup.select_one(selector)
            return node.contents[0] if first_content else node.text
        except (AttributeError, IndexError):
            return np.nan

    # Common prefix of the "details label" selectors (times and servings)
    label_base = (
        "span[class='wprm-recipe-details-label wprm-block-text-bold "
    )
    dish_dict = {}
    for key, url in dish_urls.items():
        print("Getting details for: ", key)
        response = requests.get(url)
        soup = BeautifulSoup(response.text)
        dish_details = {}
        dish_details['course'] = _scrape(
            soup,
            "span[class='wprm-recipe-course wprm-block-text-normal']",
            first_content=True
        )
        # The fallback used to be stored under the misspelled key 'cusine',
        # leaving 'cuisine' missing for dishes without that field
        dish_details['cuisine'] = _scrape(
            soup,
            "span[class='wprm-recipe-cuisine wprm-block-text-normal']",
            first_content=True
        )
        for kind in ('prep', 'cook', 'total'):
            dish_details[f'{kind}_time'] = _scrape(
                soup,
                f"{label_base}wprm-recipe-time-label "
                f"wprm-recipe-{kind}-time-label'] + span"
            )
        try:
            servings = soup.select_one(
                f"{label_base}wprm-recipe-servings-label']"
            ).next_sibling.text
            dish_details['servings'] = int(re.findall(r'\d+', servings)[0])
        except (AttributeError, IndexError):
            # Label missing, or its sibling text contains no digits
            dish_details['servings'] = np.nan
        dish_details['calories'] = _scrape(
            soup, "span[class='wprm-recipe-nutrition-with-unit']"
        )
        # Loop through all ingredients
        for ing in soup.select("li[class='wprm-recipe-ingredient']"):
            name_node = ing.select_one(
                "span[class='wprm-recipe-ingredient-name']"
            )
            if name_node is None:
                # Nothing to key the ingredient on; skip instead of crashing
                continue
            ing_name = name_node.text
            try:
                str_amount = ing.select_one(
                    "span[class='wprm-recipe-ingredient-amount']"
                ).text
                str_unit = ing.select_one(
                    "span[class='wprm-recipe-ingredient-unit']"
                ).text
                # Determine total numerical weight
                dish_details[ing_name] = get_weight(str_amount, str_unit)
            except AttributeError:
                # Amount or unit span missing: attempt to extract both
                # from the ingredient name itself
                dish_details[ing_name] = get_weight(ing_name, ing_name)
        dish_dict[key] = dish_details
    return dish_dict
def get_weight(str_amount, str_weight):
    """Transform an amount/unit pair into a single numeric quantity

    The amount is the sum of (a) the first vulgar-fraction character found,
    (b) the first typed fraction such as ``1/2`` and (c) the first
    stand-alone whole number. It is then divided by the factor of the unit
    found in ``str_weight`` (1 when no known unit is present).

    Parameters
    ----------
    str_amount: str
        Text holding the numeric amount, e.g. ``"1 1/2"`` or ``"½"``
    str_weight: str
        Text holding the unit of measurement, e.g. ``"tablespoons"``

    Returns
    -------
    weight: float
        Amount divided by the unit factor
    """
    amount = 0
    div_weight = 1
    # Set dictionary of vulgar fraction unicode characters
    vul_frac = {
        '¼': 1/4,
        '½': .5,
        '¾': 3/4,
        '⅐': 1/7,
        '⅑': 1/9,
        '⅒': .1,
        '⅓': 1/3,
        '⅔': 2/3,
        '⅕': .2,
        '⅖': .4,
        '⅗': .6,
        '⅘': .8,
        '⅙': 1/6,
        '⅚': 5/6,
        '⅛': 1/8,
        '⅜': 3/8,
        '⅝': 5/8,
        '⅞': 7/8
    }
    # Check for vulgar fraction characters (only the first one counts)
    for vul_char, frac in vul_frac.items():
        if vul_char in str_amount:
            amount += frac
            break
    # Check for typed fractions
    frac_match = re.search(r'(\d+)/(\d+)', str_amount)
    if frac_match is not None:
        num_frac, den_frac = (int(part) for part in frac_match.groups())
        if den_frac != 0:
            amount += num_frac / den_frac
    # Check for whole number. Digits must not touch a slash OR another
    # digit on either side; without the digit guard the regex backtracks
    # into fractions ("11/2" matched "1", "1/12" matched "2").
    whole_match = re.search(r'(?<![/\d])\d+(?![/\d])', str_amount)
    if whole_match is not None:
        amount += int(whole_match.group())
    # Default to one when no amount is detected
    if amount == 0:
        amount += 1
    # Set unit-value mappings
    unit_list = {
        'lb': 0.00220462,
        'bunch': 0.1,
        'liter': 0.236588,
        'quart': 0.25,
        'pinch': 0.355625,
        'pint': 0.5,
        'oz': 8,
        'ounce': 8,
        'thumb': 9.5,
        'tablespoon': 16,
        'tbsp': 16,
        'tsp': 48,
        'ml': 236.588
    }
    # Check for units; when several unit names occur, the last one in the
    # mapping's order wins
    for unit_name, unit_val in unit_list.items():
        if unit_name in str_weight:
            div_weight = unit_val
    # Compute weight
    return amount / div_weight
def get_nutrition(dish_urls):
    """Get nutritional value for all dishes

    Parameters
    ----------
    dish_urls: dict
        Dictionary containing dish names as keys and URLs as values

    Returns
    -------
    nut_dict: dict
        Dictionary containing dish names as keys and a dictionary of nutrients
        as their values. Each inner dictionary has a ``Serving`` key (NaN
        when it cannot be read) plus one key per nutrient label, holding
        the value text as scraped.
    """
    # Prepare dictionary
    nut_dict = {}
    # Retrieve all nutrients per dish
    for key, url in dish_urls.items():
        print("Getting details for: ", key)
        response = requests.get(url)
        soup = BeautifulSoup(response.text)
        nut_details = {}
        # Get servings
        try:
            servings = soup.select_one(
                "span[class='wprm-recipe-details-label wprm-block-text-bold "
                "wprm-recipe-servings-label']"
            ).next_sibling.text
            nut_details['Serving'] = int(re.findall(r'\d+', servings)[0])
        except (AttributeError, IndexError):
            # Label missing, or its sibling text contains no digits
            nut_details['Serving'] = np.nan
        # Loop through all nutrients
        for nut in soup.select(
            "span[class='wprm-nutrition-label-text-nutrition-container']"
        ):
            label_node = nut.select_one(
                "span[class='wprm-nutrition-label-text-nutrition-label "
                "wprm-block-text-normal']"
            )
            value_node = nut.select_one(
                "span[class='wprm-nutrition-label-text-nutrition-value']"
            )
            if label_node is None or value_node is None:
                # Malformed entry: skip it rather than abort the scrape
                continue
            # The label is the word immediately before the colon
            names = re.findall(r'(\w+):', label_node.text)
            if not names:
                continue
            nut_details[names[0]] = value_node.text
        nut_dict[key] = nut_details
    return nut_dict
def prepare_df(dish_dict):
    '''Create DataFrame out of dish dictionary

    Only dishes whose cuisine contains "Filipino" are kept. Raw ingredient
    columns are then folded into ingredient categories: categories are
    processed in the order listed, every raw column whose name matches the
    category pattern (case-insensitive) is summed into the category, and
    matched columns are removed so that later categories cannot reuse them.
    The order of the mapping therefore matters (e.g. ``olive_oil`` must
    come before ``olive``).

    Parameters
    ----------
    dish_dict: dict
        Dictionary containing dish names as keys and a dictionary of dish
        information and ingredients as their values

    Returns
    -------
    df_new: DataFrame
        DataFrame with one row per dish, a ``dish_name`` column and one
        column per ingredient category
    '''
    # Prepare DataFrame
    df_ing = pd.DataFrame.from_dict(dish_dict).T
    # Retain Filipino dishes
    df_ing = df_ing[df_ing.cuisine.str.contains('Filipino', na=False)]
    # Drop irrelevant columns
    df_ing = df_ing.drop(columns=["course", "cuisine", "prep_time",
                                  "cook_time", "total_time", "servings",
                                  "calories"])
    # Sort columns and drop empty columns
    df_ing = df_ing.sort_index(axis=1)
    df_ing = df_ing.dropna(axis=1, how='all')
    # fillna returns a new frame; the result was previously discarded
    df_ing = df_ing.fillna(0)
    # Prepare ingredient-keyword mappings
    cat_ing_dict = {
        'yeast': r'(yeast)',
        'wrapper': r'(wrapper)',
        'worcestershire_sauce': r'(worcestershire)',
        'winged_bean': r'(winged bean)',
        'vinegar': r'(vinegar)',
        # Accept the correct spelling as well as the "tumeric" misspelling
        'turmeric': r'(tumeric|turmeric)',
        'tomato': r'(tomato)',
        'tofu': r'(tofu)',
        'toasted_rice_powder': r'(toasted rice powder)',
        'taro': r'(taro)',
        'sweet_potato': r'(sweet potato)',
        'sugar': r'(sugar)',
        'star_anise': r'(star anise)',
        'squash': r'(kalabasa|squash)',
        'soy_sauce': r'(soy sauce)',
        'sinigang_mix': r'(sinigang)',
        'shortening': r'(shortening)',
        'sesame_oil': r'(sesame oil)',
        'scallion': r'(scallion)',
        'sayote': r'(sayote)',
        'salted_egg': r'(salted egg)',
        'safflower_oil': r'(safflower oil)',
        'raisins': r'(raisins)',
        'radish': r'(radish)',
        'potato': r'(potato)',
        'pork_insides': (
            r'(pig’s liver|pig’s heart|pig’s small intestine|'
            r'bung|pig cheeks|pig heart|pig kidney|pig stomach|'
            r'pork ears|pork large intenstine|pork liver|small intestine)'
        ),
        'pork_fat': r'(pork fat)',
        'pork_stock': r'(pork broth|pork stock|pork cube)',
        'pork_blood': r'(pork blood)',
        'pork_and_beans': r'(pork and beans)',
        'pie_crust': r'(pie)',
        'pickle': r'(pickle|relish)',
        'pepper_leaf': r'(pepper leaves)',
        'pechay': r'(pechay)',
        'peanut': r'(peanut)',
        'patola': r'(patola)',
        'parsley': r'(parsley)',
        'paprika': r'(paprika)',
        'papaya': r'(papaya)',
        'oyster_sauce': r'(oyster)',
        'onion': r'(onion)',
        'olive_oil': r'(olive oil)',
        'olive': r'(olive)',
        'okra': r'(okra)',
        'nutmeg': r'(nutmeg)',
        'noodle': r'(noodle|pancit|sotanghon|misua|miswa)',
        'mushroom': r'(mushroom)',
        'munggo': r'(mung)',
        'mirin': r'(mirin)',
        'mayonnaise': r'(mayonnaise)',
        'malunggay': r'(malunggay)',
        'liver_spread': r'(liver)',
        'lemongrass': r'(lemongrass)',
        'leeks': r'(leeks)',
        'lechon_sauce': r'(lechon)',
        'kasubha': r'(kasubha)',
        'kangkong': r'(spinach|kangkong)',
        'jicama': r'(jicama)',
        'jackfruit': r'(jackfruit)',
        'ice': r'(ice)',
        'hotdog': r'(hotdog)',
        'hot_sauce': r'(hot)',
        'honey': r'(honey)',
        'hoisin_sauce': r'(hoisin)',
        'green_pea': r'(green pea|pigeon pea)',
        'green_bean': (
            r'(green beans|sitaw|snake beans|string beans|snap pea|snow pea)'
        ),
        'glutinous_rice': r'(glutinous rice)',
        'ginger': r'(ginger)',
        'ginataang_gulay_mix': r'(ginataang)',
        'garlic': r'(garlic)',
        'flour': r'(flour)',
        'eggplant': r'(eggplant|talong)',
        'egg': r'(egg)',
        'curry_powder': r'(curry)',
        'cucumber': r'(cucumber)',
        'cream': r'(cream)',
        'cooking_wine': r'(wine)',
        'cooking_oil': r'(cooking oil|vegetable oil)',
        'coconut_water': r'(coconut water)',
        'coconut_milk': r'(coconut cream|coconut milk)',
        'coconut_meat': r'(coconut meat)',
        'clear_soda': r'(7-up|sprite|clear softdrink)',
        'cinnamon': r'(cinnamon)',
        'sausage': r'(chinese sauage|chorizo)',
        'chicken_stock': r'(chicken broth|chicken cube)',
        'chicharon': r'(chicharon)',
        'cheese': r'(cheese)',
        'celery': r'(celery)',
        'carrot': r'(carrot)',
        'canned_meat': r'(potted meat|luncheon meat)',
        'calamansi': r'(calamansi|lemon|lime)',
        'cabbage': r'(cabbage)',
        'butter': r'(butter|margarine)',
        'broccoli': r'(broccoli)',
        'bread': r'(bread)',
        'bok_choy': r'(bok choy|bokchoy)',
        'black_soda': r'(coke|cola)',
        'black_bean': r'(black bean)',
        'beer': r'(beer)',
        'beef_insides': (
            r'(lard|cow|beef heart|beef kidney|beef large instestine|'
            r'beef liver|beef neck bone|beef small intestine|bile|tripe|'
            r'tongue|tripe|lengua)'
        ),
        'beef_stock': (
            r'(beef cube|beef bouillon|bulalo|beef broth|beef stock)'
        ),
        'bay_leaf': r'(bay)',
        'bamboo_shoots': r'(bamboo shoots)',
        'baking_powder': r'(baking powder)',
        'annatto': r'(annatto)',
        'ampalaya': r'(ampalaya)',
        'adobo_sauce': r'(adobo)',
        'achiote': r'(achiote)',
        'tomato_liquid': (
            r'(ketchup|tomato sauce|tomato paste|spaghetti sauce)'
        ),
        'banana_flower': r'(blossom)',
        'pepper': r'(white pepper|black pepper|crushed pepper|peppercorn)',
        'chili': (
            r'(chili|pepper flakes|serrano pepper|sili|jalapeno|'
            r'ghost pepper|green pepper)'
        ),
        'bell_pepper': r'(bell pepper)',
        'bagoong': r'(alamang|shrimp paste|balaw)',
        'liquid_seasoning': r'(liquid seasoning|savorrich|marinade)',
        'chickpea': r'(chick pea|garbanzos)',
        'chicken_insides': r'(chicken gizzard|chicken hear|chicken liver)',
        'cornstarch': r'(cornstarch)',
        'corned_beef': r'(corned beef)',
        'fish_sauce': r'(fish sauce)',
        'pineapple_juice': r'(pineapple juice|juice from the canned tidbits)',
        'shrimp_cube': r'(shrimp cube)',
        'watermelon': r'(watermelon)',
        'milk': r'(milk)',
        'pea': r'(pea)',
        'pasta': r'(spaghetti|macaroni)',
        'shrimp': r'(shrimp)',
        'pineapple': r'(pineapple)',
        'water': r'(water)',
        'salt': r'(salt)',
        'rice': r'(rice|sinangag)',
        'pork': r'(pork|pig|lechon)',
        'corn': r'(corn)',
        'chicken': r'(chicken)',
        'beef': r'(beef|steak|oxtail|ox tail|sirloin|bistek)',
        'banana': r'(banana|plantain)'
    }
    # Collect the category columns first and build the frame once, which
    # avoids inserting 150+ columns one at a time
    category_totals = {}
    # Loop through the categories
    for cat_name, cat_regex in cat_ing_dict.items():
        # Look for columns that contains the query
        ing_filter = (
            df_ing.columns.to_series()
            .str.contains(cat_regex, case=False, regex=True)
        )
        # Filter columns that satisfy the query
        filter_cols = ing_filter[ing_filter].index
        print(f"Columns obtained for {cat_name}:", filter_cols)
        # Total the matching raw columns for this category
        category_totals[cat_name] = df_ing[filter_cols].sum(axis=1)
        # Drop columns to prevent reusing of ingredients
        df_ing = df_ing.drop(columns=filter_cols)
    df_new = pd.DataFrame(category_totals, index=df_ing.index)
    df_new = df_new.rename_axis('dish_name').reset_index()
    return df_new
def prepare_nut_df(nut_dict, dish_idx):
    '''Build the nutrition DataFrame for a chosen set of dishes

    Parameters
    ----------
    nut_dict: dict
        Dictionary containing dish names as keys and a dictionary of nutrients
        as their values
    dish_idx: list
        Dish names to keep, taken from the Ingredient Information DataFrame.
        Every name must exist in ``nut_dict``.

    Returns
    -------
    df_nut: DataFrame
        One row per requested dish (in the order given), a ``dish_name``
        column and one column per nutrient; missing values are set to 0
    '''
    # Dishes become rows once the dict-of-dicts frame is transposed
    all_dishes = pd.DataFrame.from_dict(nut_dict).T
    # Keep only the requested dishes and zero-fill absent nutrients
    selected = all_dishes.loc[dish_idx].fillna(0)
    return selected.rename_axis('dish_name').reset_index()
def export_sql(df_dish, df_nut, db_path='ulam_nut.db'):
    """Export the ingredient and nutrition DataFrames to a SQLite file

    Parameters
    ----------
    df_dish: DataFrame
        Ingredient table; written to the ``rekado`` table
    df_nut: DataFrame
        Nutrition table; written to the ``nutrition`` table
    db_path: str
        Path of the SQLite database file. Defaults to ``'ulam_nut.db'``.

    Existing tables with the same names are replaced. The DataFrame index
    is not written.
    """
    # Open connection to DB file
    conn = sqlite3.connect(db_path)
    try:
        # Transform DataFrame into DB table
        df_dish.to_sql('rekado', con=conn, if_exists='replace', index=False)
        df_nut.to_sql('nutrition', con=conn, if_exists='replace', index=False)
    finally:
        # Close the connection even when a write fails
        conn.close()
def retrieve_data():
    '''Load the ingredient and nutrition tables from ``ulam_nut.db``

    Returns
    -------
    df_ing, df_nut: DataFrame
        Contents of the ``rekado`` (ingredients) and ``nutrition`` tables,
        respectively
    '''
    engine = create_engine('sqlite:///ulam_nut.db')
    ingredients_query = "\nSELECT * FROM rekado\n"
    nutrition_query = "\nSELECT * FROM nutrition\n"
    with engine.connect() as conn:
        # Both tables are read over the same connection
        df_ing = pd.read_sql(ingredients_query, conn)
        df_nut = pd.read_sql(nutrition_query, conn)
    return df_ing, df_nut
def drop_features(data, irrelevant_cols, skip_cols, p):
    """
    Drop low-variance features using a variance threshold

    Parameters
    ----------
    data : dataframe
        dataframe from which the columns will be dropped. May contain
        a mix of numeric and categorical columns. Non-numeric columns are
        label-encoded before the variance is computed.
    irrelevant_cols : list or None
        Column names that are removed before thresholding and never
        returned.
    skip_cols : list or None
        Column names that bypass thresholding and are always returned
        (placed first in the result).
    p : float
        Threshold passed to ``VarianceThreshold``; columns whose variance
        does not exceed it are dropped.

    Returns
    -------
    dataframe
        Copy of the original data restricted to ``skip_cols`` followed by
        the columns that passed the threshold (numeric columns first, then
        non-numeric ones), with their original values.
    """
    original = data.copy(deep=True)
    # Narrow down to the candidate columns for thresholding
    candidates = data
    if irrelevant_cols is not None:
        candidates = original.drop(irrelevant_cols, axis=1)
    if skip_cols is not None:
        candidates = candidates.drop(skip_cols, axis=1)
    numeric_part = pd.DataFrame(candidates.select_dtypes(include=np.number))
    other_part = pd.DataFrame(candidates.select_dtypes(exclude=np.number))
    if other_part.shape[1] == 0:
        encoded = numeric_part
    else:
        # Label-encode each non-numeric column so a variance can be computed
        other_part = other_part.apply(LabelEncoder().fit_transform)
        encoded = pd.concat([numeric_part, other_part], axis=1)
    selector = VarianceThreshold(threshold=p)
    selector.fit_transform(encoded)
    kept_positions = selector.get_support(indices=True)
    cols_kept = list(encoded.columns[kept_positions])
    if skip_cols is None:
        return original[cols_kept]
    return original[skip_cols + cols_kept]
def fix_dtypes(data):
    """Return a copy of the dataframe with inferred column data types

    Columns are converted with ``DataFrame.convert_dtypes`` (string
    conversion disabled), then every column whose name contains ``'_dt'``
    is parsed as datetime; unparseable values become ``NaT``.

    Parameters
    ----------
    data : dataframe
        dataframe whose columns will be converted. May contain a mix of
        numeric and categorical columns. It is not modified.

    Returns
    -------
    dataframe
        Converted copy of ``data``.
    """
    converted = data.copy(deep=True).convert_dtypes(
        infer_objects=False, convert_string=False, convert_floating=True
    )
    date_columns = [name for name in converted.columns if '_dt' in name]
    for name in date_columns:
        converted[name] = pd.to_datetime(converted[name], errors='coerce')
    return converted
def manual_fix_dtypes(data, float_cols):
    """Return a copy of the dataframe with explicitly chosen data types

    The listed columns are converted to numbers and every column whose
    name contains ``'_dt'`` is parsed as datetime. Values that cannot be
    converted become ``NaN`` / ``NaT``.

    Parameters
    ----------
    data : dataframe
        dataframe whose columns will be converted. May contain a mix of
        numeric and categorical columns. It is not modified.
    float_cols : list
        List of columns to be converted to numeric

    Returns
    -------
    dataframe
        Converted copy of ``data``.
    """
    result = data.copy(deep=True)
    numeric_block = result[float_cols].apply(pd.to_numeric, errors='coerce')
    result[float_cols] = numeric_block
    date_columns = [name for name in result.columns if '_dt' in name]
    for name in date_columns:
        result[name] = pd.to_datetime(result[name], errors='coerce')
    return result
def truncated_svd(X, thresh=0.90):
    """Perform singular value decomposition on a design matrix X

    Despite the name, the full (untruncated) factors are returned; the
    component count implied by ``thresh`` is computed but not returned.

    Parameters
    ----------
    X : array
        Matrix of numbers to decompose.
    thresh : float
        A number between 0 to 1 that serves as the cut-off for choosing
        the number of SV components to keep. It currently has no effect on
        the returned values.

    Returns
    -------
    Q : array
        Left singular vectors as columns.
    S : array
        Square diagonal matrix of the singular values.
    P : array
        Right singular vectors as columns.
    NSSD : array
        Normalized squared singular values (they sum to 1).
    """
    left, singular_values, right_t = np.linalg.svd(X, full_matrices=True)
    norm = np.sqrt(np.sum(singular_values**2))
    NSSD = (singular_values / norm)**2
    # Number of leading components needed to pass `thresh`; not returned
    ind = bisect.bisect(NSSD.cumsum(), thresh) + 1
    return left, np.diag(singular_values), right_t.T, NSSD
def project_svd(q, s, k):
    """Project the design matrix on to the first k singular vectors

    Parameters
    ----------
    q : array
        2-D array whose columns are the singular vectors.
    s : array
        2-D (diagonal) array of singular values.
    k : int
        Number of leading components to keep.

    Returns
    -------
    array
        Product of the first ``k`` columns of ``q`` with the top-left
        ``k`` x ``k`` block of ``s``.
    """
    leading_vectors = q[:, :k]
    leading_values = s[:k, :k]
    return leading_vectors.dot(leading_values)
def plot_svd_ulam(data, num_comp, num_ing, fill_fn, manual_fill_values):
    """Plot the strongest loadings of one SV component as a bar chart

    Keeps the ``num_ing`` features with the largest absolute loading on
    the chosen component, ordered from largest to smallest magnitude.

    Parameters
    ----------
    data : dataframe
        pandas dataframe whose columns are the SV components and whose
        index holds the feature names. The rows contain the loadings for
        each feature.
    num_comp : int
        1-based position of the SV component to plot.
    num_ing : int
        Number of features to show.
    fill_fn : function
        Function that maps the loading values to a color label.
    manual_fill_values : list
        List of HEX colors passed to the manual fill scale.

    Returns
    -------
    ggplot
        The plotnine figure (not saved or shown here).
    """
    loadings = data.iloc[:, (num_comp-1)].reset_index()
    loadings.columns = ['ing', 'loading']
    loadings['abs_loading'] = np.abs(loadings['loading'])
    # Strongest contributors first, regardless of sign
    top = loadings.sort_values('abs_loading', ascending=False).head(num_ing)
    top['ing'] = top['ing'].str.replace('_', ' ').str.title()
    # Ordered categorical keeps the bars in the sorted order
    top['ing'] = pd.Categorical(top['ing'], categories=top['ing'], ordered=True)
    top['ing_color'] = fill_fn(top['loading'])
    bars = geom_bar(aes(fill='ing_color'), stat='identity', show_legend=False)
    return (
        ggplot(top, aes(x='ing', y='loading')) +
        bars +
        scale_fill_manual(values=manual_fill_values) +
        xlab('') +
        ylab('') +
        proj_theme
    )
def plot_svd_zoomed(data, zoom_on,
                    num_comp, num_ing,
                    fill_fn, manual_fill_values):
    """Plot a zoomed-in view of one SV component and save it as a PNG

    The loadings of the chosen component are sorted in ascending order and
    a subset of rows is drawn as a horizontal bar chart. The figure is
    written to ``SV<num_comp>_<zoom_on>.png``; nothing is returned.

    Parameters
    ----------
    data : dataframe
        pandas dataframe whose columns are the SV components and whose
        index holds the feature names. The rows contain the loadings for
        each feature.
    zoom_on : ['dominant', 'close to zero']
        'dominant' keeps the ``num_ing`` lowest and ``num_ing`` highest
        loadings. 'close to zero' keeps rows around the loading nearest to
        zero. Any other value plots every feature.
    num_comp : int
        1-based position of the SV component to plot.
    num_ing : int
        Number of features used to size the zoom window.
    fill_fn : function
        Function that maps the loading values to a color label.
    manual_fill_values : list
        List of HEX colors passed to the manual fill scale.
    """
    pc = data.iloc[:, (num_comp-1)].sort_values().reset_index()
    pc.columns = ['ing', 'loading']
    n_rows = pc.shape[0]
    keep = None
    if zoom_on == 'dominant':
        # Both tails of the sorted loadings
        keep = list(range(0, num_ing)) + list(range(n_rows-num_ing, n_rows))
    elif zoom_on == 'close to zero':
        min_dist = min(np.abs(pc['loading'] - 0))
        min_ind = pc[np.abs(pc['loading']) == min_dist].index[0]
        # NOTE(review): the window leaves out the rows within 5 positions
        # of the loading nearest to zero -- confirm this gap is intended.
        below = list(range(min_ind-num_ing, min_ind-5))
        above = list(range(min_ind+5, min_ind+num_ing))
        keep = below + above
    if keep is not None:
        pc = pc[pc.index.isin(keep)]
    pc['ing'] = pc['ing'].str.replace('_', ' ').str.title()
    # Ordered categorical keeps the bars in the sorted order
    pc['ing'] = pd.Categorical(pc['ing'], categories=pc['ing'], ordered=True)
    pc['ing_color'] = fill_fn(pc['loading'])
    bars = geom_bar(aes(fill='ing_color'), stat='identity', show_legend=False)
    figure = (
        ggplot(pc, aes(x='ing', y='loading')) +
        bars +
        scale_fill_manual(values=manual_fill_values) +
        coord_flip() +
        xlab('') +
        ylab('') +
        proj_theme
    )
    figure.save('SV'+str(num_comp)+'_'+zoom_on+'.png', width=8, height=6)
def final_clustering(transformed_arr,
                     orig_df,
                     method,
                     threshold,
                     plot_threshold,
                     break_biggest_cluster=False,
                     link_colors=['#5594BA',
                                  '#EFC564',
                                  '#DA4D2E',
                                  '#722502',
                                  '#EFC564',
                                  '#B3C55A']):
    """Perform modified hierarchical clustering and draw its dendrogram

    Builds a linkage on the given matrix. Optionally, the largest flat
    cluster (cut at ``threshold``) is re-clustered on its own with Ward
    linkage, and that second linkage is the one plotted and returned.

    Parameters
    ----------
    transformed_arr : array
        Matrix to be placed in the clustering algorithm
    orig_df : dataframe
        Dataframe with one row per observation of ``transformed_arr``.
        When ``break_biggest_cluster`` is True, the rows of the biggest
        cluster are taken from this dataframe and re-clustered.
    method : string
        The method parameter of hierarchy.linkage.
    threshold : float
        Distance threshold used to cut the tree into flat clusters.
    plot_threshold : float
        Passed as ``p`` to the level-truncated dendrogram. For plotting
        purposes only.
    break_biggest_cluster : bool
        Determines whether the biggest cluster will be further broken down.
    link_colors : list
        List of HEX colors for the dendrogram plot. Sets scipy's global
        link color palette.

    Returns
    -------
    array
        The linkage matrix that was plotted.
    """
    Z = hierarchy.linkage(
        transformed_arr, method=method, optimal_ordering=True
    )
    hierarchy.set_link_color_palette(link_colors)
    flat_labels = fcluster(Z, t=threshold, criterion='distance')
    if break_biggest_cluster:
        labelled = orig_df.copy()
        labelled['cluster_no'] = flat_labels
        biggest = labelled.groupby(['cluster_no']).size().idxmax()
        members = labelled[labelled['cluster_no'] == biggest]
        members = members.drop(columns=['cluster_no'])
        # The sub-clustering always uses Ward linkage
        Z = linkage(members, method='ward', optimal_ordering=True)
    fig, ax = plt.subplots(figsize=(8, 5), dpi=100)
    fig.patch.set_facecolor('white')
    fig.patch.set_alpha(0.6)
    ax.patch.set_facecolor('white')
    ax.patch.set_alpha(0.0)
    for side in ('bottom', 'top', 'right', 'left'):
        ax.spines[side].set_color('#722502')
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, colors='#722502')
    ax.set_ylabel(r'$\Delta$')
    dendrogram(Z, ax=ax, p=plot_threshold, truncate_mode='level')
    return Z
def plotly_clusters(data, y_ref, x_ref, c_map):
    """Plot the nutrition clustering results on the ingredient SV components

    Draws a scatter plot of the dishes on the ``SV2`` (x) and ``SV5`` (y)
    columns, colored by ``cluster_names``, with one horizontal and one
    vertical reference line, then shows the figure.

    Parameters
    ----------
    data : dataframe
        pandas dataframe that has at least the columns ``SV2``, ``SV5``,
        ``cluster_names`` and ``dish_name``.
    y_ref : float
        y position of the horizontal reference line.
    x_ref : float
        x position of the vertical reference line.
    c_map : dict
        Dictionary that maps the cluster names to a HEX color.
    """
    reference_line = dict(color='rgba(114, 37, 2, 0.5)', width=2.5)
    axis_style = dict(
        showline=True, linewidth=2, linecolor='#722502', gridcolor='#F7E2B1'
    )
    fig = px.scatter(data.sort_values('cluster_names'),
                     x='SV2',
                     y='SV5',
                     color='cluster_names',
                     hover_data=['dish_name'],
                     labels={
                         'SV2': '<b>Meat and Vegetable Range</b>',
                         'SV5': '<b>Flavor Range</b>',
                         'cluster_names': '<b>Cluster</b>',
                         'dish_name': '<b>Dish Name</b>'},
                     # Was `cmap`, an undefined name; use the parameter
                     color_discrete_map=c_map,
                     width=1000, height=800)
    fig.update_traces(marker={'size': 14})
    fig.for_each_trace(
        lambda t: t.update(textfont_color='#99D072', textposition='top right')
    )
    # Horizontal line at y = y_ref spanning the full plot width
    fig.add_shape(
        type='line',
        yref='y',
        y0=y_ref,
        y1=y_ref,
        xref='paper',
        x0=0,
        x1=1,
        line=reference_line
    )
    # Vertical line at x = x_ref spanning the full plot height
    fig.add_shape(
        type='line',
        yref='paper',
        y0=0,
        y1=1,
        xref='x',
        x0=x_ref,
        x1=x_ref,
        line=reference_line
    )
    fig.update_layout(
        title={
            'xanchor': 'center',
            'yanchor': 'top',
            'x': 0.5},
        margin=dict(l=0, r=0, t=0, b=0),
        font={'size': 20, 'color': '#722502'},
        plot_bgcolor='white',
        xaxis_range=[-1, 1],
        yaxis_range=[-1, 1],
        paper_bgcolor='white'
    )
    fig.update_xaxes(**axis_style)
    fig.update_yaxes(**axis_style)
    fig.show()
def plot_nutrients(data, cluster_name):
    """Plot the nutrient contents of one cluster and save it as a PNG

    Draws one horizontal bar per nutrient, faceted by nutrient group, and
    writes the figure to ``<cluster_name>.png``. Nothing is returned.

    Parameters
    ----------
    data : dataframe
        pandas dataframe indexed by cluster name, with one column per
        nutrient holding that cluster's nutrient content.
    cluster_name : str
        Cluster name whose nutrient contents are to be plotted.
    """
    # Nutrient -> facet label; nutrients not listed here map to None
    group_of = {
        'Sugar': 'Carbs (g)',
        'Fiber': 'Carbs (g)',
        'Carbohydrates': 'Carbs (g)',
        'Fat': 'Lipids (mg)',
        'Cholesterol': 'Lipids (mg)',
        'Protein': 'Proteins (g)',
        'Vitamin A': 'Vitamins (mg)',
        'Vitamin C': 'Vitamins (mg)',
        'Iron': 'Minerals (g)',
        'Sodium': 'Minerals (g)',
        'Calcium': 'Minerals (g)',
        'Potassium': 'Minerals (g)',
    }
    # Nutrient -> HEX color; nutrients not listed here map to None
    color_of = {
        'Carbohydrates': '#EFC564',
        'Sugar': '#EFC564',
        'Fiber': '#EFC564',
        'Fat': '#DA4D2E',
        'Cholesterol': '#DA4D2E',
        'Protein': '#722502',
        'Vitamin A': '#F4903E',
        'Vitamin C': '#F4903E',
        'Calcium': '#8D9F38',
        'Potassium': '#8D9F38',
        'Sodium': '#8D9F38',
        'Iron': '#8D9F38',
    }

    def custom_names(x):
        """Return the nutrient group label for nutrient ``x``"""
        return group_of.get(x)

    def custom_colors(x):
        """Return the HEX color for nutrient ``x``"""
        return color_of.get(x)

    cluster_bar_df = data.loc[cluster_name].reset_index()
    cluster_bar_df.columns = ['Nutrient', 'Value']
    # Vitamin A is rescaled by 0.6/1000; every other value is kept as is
    is_vitamin_a = cluster_bar_df['Nutrient'].isin(['Vitamin A'])
    cluster_bar_df['Value'] = np.where(
        is_vitamin_a,
        cluster_bar_df['Value']*0.6/1000,
        cluster_bar_df['Value']
    )
    nutrient_order = ['Carbohydrates', 'Sugar', 'Fiber',
                      'Protein', 'Cholesterol', 'Fat',
                      'Vitamin A', 'Vitamin C',
                      'Sodium', 'Potassium', 'Calcium', 'Iron']
    cluster_bar_df['Nutrient'] = pd.Categorical(
        cluster_bar_df['Nutrient'], categories=nutrient_order, ordered=True
    )
    group_order = ['Carbs (g)', 'Proteins (g)', 'Lipids (mg)',
                   'Vitamins (mg)', 'Minerals (g)']
    func = np.vectorize(custom_names)
    cluster_bar_df['Nutrient Group'] = func(cluster_bar_df['Nutrient'])
    cluster_bar_df['Nutrient Group'] = pd.Categorical(
        cluster_bar_df['Nutrient Group'], categories=group_order, ordered=True
    )
    func = np.vectorize(custom_colors)
    cluster_bar_df['nutrient_colors'] = func(cluster_bar_df['Nutrient'])
    bars = geom_bar(
        aes(fill='Nutrient Group'),
        stat='identity',
        width=0.6,
        position='dodge',
        show_legend=False
    )
    # Fill colors follow the order of the nutrient group categories
    fills = scale_fill_manual(
        values=['#EFC564', '#722502', '#DA4D2E', '#F4903E', '#8D9F38']
    )
    p = (
        ggplot(cluster_bar_df, aes(x='Nutrient', y='Value')) +
        bars +
        fills +
        coord_flip() +
        facet_wrap('Nutrient Group', scales='free', ncol=1) +
        xlab('') +
        ylab('') +
        geom_blank(aes(y=100)) +
        proj_theme2
    )
    p.save(cluster_name+'.png', width=5, height=10)
The combination of flavors and ingredients in Filipino cuisine has come a long way: from being a brainchild of Western and Asian cooking techniques into becoming a distinct style that we Filipinos love and appreciate to this very date. However, this poses a cliché question: are the things we love truly good for us? With the recent emergence of health concerns, Filipinos are becoming more careful, especially with regards to the food they eat.
This study aims to discover the groupings of Filipino dishes according to their nutritional value and see how the dishes’ ingredients contribute to such groupings. Given this overarching problem, this study used two main data sources: the Ingredient Dataset and the Nutritional Value Dataset which were scraped and compiled from the Panlasang Pinoy website. The ingredients list was used to identify the natural combinations of Filipino ingredients, while information about the dishes' nutritional value was used in creating the nutrition-based clusters. Out of all the clustering methods explored, Ward's provided the best visual and domain knowledge interpretation while still retaining parsimony. Using this methodology, this study offers two sets of findings: (1) the nutrient-based clusters of Filipino dishes and their corresponding interpretation, and (2) the nutrient-based clusters projected on the ingredient space. This allows us to not only understand the underlying nutritional trends in our dishes but also identify healthier alternative ingredients for otherwise unhealthy dishes.
The team discovered four nutrient-based clusters of Filipino dishes: a Typical Filipino Meal cluster, characterized by strong taste and absence of vegetables that led to a deficiency in vitamins and minerals; an Upgraded Filipino Meal cluster, which contains improved amounts of vitamins that is driven by the presence of tomatoes; a Super Filipino Meal cluster, which is considered to be the healthiest out of all clusters due to the recognizable amount of fruits and vegetables, and a Fatty and Salty Filipino Meal, characterized by high amounts of sodium and cholesterol. When projected on the ingredient components, almost all clusters are clumped together in the sweet and salty flavor range and are spread across the meat and vegetable range. This suggests that regardless of the combination of meat and vegetables found in a Filipino dish, as long as it is in the sweet and salty flavor range, the dish will almost always fall under the Typical Filipino Meal, the Upgraded Filipino Meal, or the Fatty and Salty Filipino Meal. The more the dishes make use of pork and garlic combination instead of the lean meat and crops combination, the more likely it is to belong to the Fatty and Salty cluster. However, if this pork and garlic combination is in a nutty sauce that is accompanied by vitamin-enriched vegetables, then they are likely to belong to the Super Filipino Meal.
Aside from the intent of providing use cases that utilize the insights from ingredient-based clusters, the team suggests having a balanced focus on nutrition as well. With nutrition-based clustering, business owners and passionate chefs can craft an entirely different menu that gives priority to nutrition or find innovative ways to turn unhealthy dishes into superfoods. For future studies, the team recommends looking at various recipes from other cookbooks and restaurants that could help provide better results.
We, Filipinos, love food; there is no doubt about this. The richness of flavor in Filipino Cuisine is one of a kind and truly phenomenal. However, can we say the same about its nutritional value? Is the world-famous Adobo healthy? What about our Mechado or Afritada?
Contrary to most of our Southeast Asian neighbors, Filipino dishes usually shy away from using herbs and are inclined to use unique ingredients like offal, gizzard, and chicken intestines. Apart from having a variety of ingredients, the style of making Filipino dishes pulls a lot from Spanish and American influences, as well as Asian influences such as Chinese and Indonesian [1]. Amidst the boldness and creativity found in our dishes, we wonder if their nutritional value is as commendable as their flavors.
As the years have gone by, people are becoming more health-conscious than ever, especially now because of the ongoing pandemic [2]. While practicing consciousness about our health is easy whenever we go to a restaurant or call for food delivery, there is little literature yet about how healthy, in general, homecooked Filipino dishes are. Variations of a Filipino dish grow multiplicatively, while the Filipino mass remains ignorant of the source's nutritional content. The researchers suspect that because of this, the orientation for innovation and reinvention in the Filipino cuisine generally pointed towards experimenting with flavors and textures but not towards finding healthier alternatives.
This challenge served as the group's inspiration to study the nutritional patterns of popular Filipino dishes as the first step towards helping the Filipinos be more intentional with their cooking and eating choices while still enjoying good food. In this study, we will uncover not only insights about the dishes' overall nutrient content, but also identify ingredient combinations that are likely to be healthier, and thus could serve as a starting point in creating healthier dish variants. The researchers hope that this study will encourage Filipinos to cook and consume healthier menus by identifying unhealthy dishes while providing alternatives, and re-orienting innovation and reinvention of traditional Filipino recipes towards a healthier track.
What do Filipino dishes look like from a nutrition-based standpoint?
The study used the recipes from Panlasang Pinoy; a Filipino food blog created by Mr. Vanjo Merano to showcase his passion for cooking and promote Filipino Cuisine to the rest of the world. Each Filipino dish page is presented in a blog-like fashion, complete with a narrative, cooking tips, and of course the recipe itself. The recipe contains brief information about the dish, cooking utensils and equipment needed, ingredients, and instructions (Figure 1). Some recipes are also accompanied by their estimated nutritional value based on the said ingredients. Compared to other sources, the units of both the ingredients and nutritional content in this website are relatively more standardized.
Only the relevant information from the recipe, such as the serving size, ingredients list, and nutrition content of the recipe was scraped. The ingredients list was used to identify the natural combinations of Filipino ingredients, while the nutrition content was used in creating the nutrition-based clusters. While information about the ingredients was not used in clustering per se, they were vital in interpreting the clustering results, particularly in understanding whether a combination of ingredients results in healthy or unhealthy dishes.
The ingredient lists for the different Filipino dishes were compiled into the Ingredient Dataset. Each of its rows corresponds to a dish, while each of its columns corresponds to an ingredient. Ingredients that do not appear in the dish were assigned a value of 0. Further, ingredients that represent almost the same flavor and texture were collapsed (e.g., black pepper and peppercorn were collapsed into pepper). Table 1 below lists all the ingredients that can be found in the dataset, grouped according to their kind.
beef_insides |
ampalaya |
beef_stock |
achiote |
beef |
bamboo_shoots |
beer |
adobo_sauce |
canned_meat |
banana |
black_soda |
annatto |
chicharon |
bell_pepper |
chicken_stock |
bagoong |
chicken_insides |
black_bean |
clear_soda |
baking_powder |
chicken |
bok_choy |
coconut_milk |
banana_flower |
corned_beef |
bread |
ice |
bay_leaf |
egg |
broccoli |
milk |
butter |
hotdog |
cabbage |
pineapple_juice |
calamansi |
pork_and_beans |
carrot |
pork_stock |
cheese |
pork_blood |
celery |
tomato_liquid |
chili |
pork_fat |
chickpea |
water |
cinnamon |
pork_insides |
coconut_meat |
coconut_water |
|
pork |
corn |
cooking_oil |
|
salted_egg |
cucumber |
cooking_wine |
|
sausage |
eggplant |
cornstarch |
|
shrimp |
glutinous_rice |
cream |
|
green_bean |
curry_powder |
||
green_pea |
fish_sauce |
||
jackfruit |
flour |
||
jicama |
garlic |
||
kangkong |
ginataang_gulay_mix |
||
malunggay |
ginger |
||
munggo |
hoisin_sauce |
||
mushroom |
honey |
||
noodle |
hot_sauce |
||
okra |
kasubha |
||
olive |
lechon_sauce |
||
papaya |
leeks |
||
pasta |
lemongrass |
||
patola |
liquid_seasoning |
||
pea |
liver_spread |
||
peanut |
mayonnaise |
||
pechay |
mirin |
||
pepper_leaf |
nutmeg |
||
pickle |
olive_oil |
||
pie_crust |
onion |
||
pineapple |
oyster_sauce |
||
potato |
paprika |
||
radish |
parsley |
||
raisins |
pepper |
||
rice |
safflower_oil |
||
sayote |
salt |
||
squash |
scallion |
||
sweet_potato |
sesame_oil |
||
taro |
shortening |
||
tofu |
shrimp_cube |
||
tomato |
sinigang_mix |
||
watermelon |
soy_sauce |
||
winged_bean |
star_anise |
||
wrapper |
sugar |
||
toasted_rice_powder |
|||
turmeric |
|||
vinegar |
|||
worcestershire_sauce |
|||
yeast |
Finally, to allow better comparability and interpretation, the unit of measurements underwent a two-step scaling method (see Data Preparation for more details). In this way, all dry ingredients in the dataset are measured in grams, all wet ingredients are measured in cups, and all ingredients that come in packs and bundles (e.g., dozen) are measured in an individual count of pieces.
The nutritional content for the different Filipino dishes was compiled into the Nutritional Value Dataset. Each of its rows corresponds to a dish, while its columns hold the recipe's suggested serving size along with its different nutrient contents: carbohydrates, protein, fat, cholesterol, sodium, potassium, fiber, sugar, vitamin A, vitamin C, calcium, and iron (Table 2). Since all units of measurement for the nutrients are the same throughout the website, only a minimal standardization step was performed (see Data Preparation for more details).
Serving |
Number of servings in the recipe |
Carbohydrates |
Total amount of carbohydrates in grams (g) |
Protein |
Total amount of protein in grams (g) |
Fat |
Total amount of fat in grams (g) |
Cholesterol |
Total amount of cholesterol in milligrams (mg) |
Sodium |
Total amount of sodium in milligrams (mg) |
Potassium |
Total amount of potassium in milligrams (mg) |
Fiber |
Total amount of fiber in grams (g) |
Sugar |
Total amount of sugar in grams (g) |
A |
Total amount of Vitamin A in international unit (IU) |
C |
Total amount of Vitamin C in milligrams (mg) |
Calcium |
Total amount of calcium in milligrams (mg) |
Iron |
Total amount of iron in milligrams (mg) |
The team created an SQLite database, named ulam_nut.db, consisting of two tables that have the relevant and necessary information to answer all the study's research questions. The tables are created according to the information they contain: one containing ingredient information, and the other containing nutritional value. Table 3 below summarizes the contents of the database, followed by a preview of the mentioned tables.
| Total number of Filipino dishes | 282 dishes |
| Total number of ingredients | 137 ingredients |
| Total number of nutrients | 13 nutrients |
# Load the ingredient and nutrient tables. retrieve_data is defined elsewhere
# in this notebook (presumably it reads the ulam_nut.db SQLite database
# described in the surrounding text — verify).
df_ing, df_nut = retrieve_data()
# Preview the first rows of each table.
df_ing.head()
df_nut.head()
This study is not without limitations and assumptions. The researchers acknowledge that there are different interpretations for a single Filipino dish and that the ones from Panlasang Pinoy are not to be considered as the gold standard. With that said, the recipes in this study ultimately rely on the interpretation of the website's owner and contributors. The researchers also have to impose an assumption that the nutritional value of the dish found on Panlasang Pinoy is correct, and that the conversion multipliers used in correspondingly scaling all ingredients into grams (dry), cups (wet), and pieces (bundles) are correct.
As we want to allow the algorithms to learn the underlying patterns beyond merely the protein types, we only included pork, chicken, and beef dishes. Further, we did not inform the algorithm about the techniques used in the dishes (e.g., boiled, braised, etc.), sources of origin (whether cultural or location-based), and whether the dish is a variant of an existing one. While these are valuable pieces of information, they are too difficult to define and find proxies for given the limited time.
During the scraping and preparation of the Ingredient Dataset and Nutritional Value Dataset, the following steps were performed:
pork_insides |
"pig’s liver", "pig’s heart", "pig’s small intestine", "bung", "pig cheeks", "pig heart", "pig kidney", "pig stomach", "pork ears", "pork large intenstine", "pork liver", "small intestine" |
pork_stock |
"pork broth", "pork stock", "pork cube" |
noodle |
"noodle", "pancit", "sotanghon", "misua", "miswa" |
kangkong |
"spinach", "kangkong" |
green_pea |
"green pea", "pigeon pea" |
green_bean |
"green beans", "sitaw", "snake beans", "string beans", "snap pea", "snow pea" |
eggplant |
"eggplant", "talong" |
cooking_oil |
"cooking oil", "vegetable oil" |
coconut_milk |
"coconut cream", "coconut milk" |
clear_soda |
"7-up", "sprite", "clear softdrink" |
sausage |
"chinese sauage", "chorizo" |
chicken_stock |
"chicken broth", "chicken cube" |
canned_meat |
"potted meat", "luncheon meat" |
calamansi |
"calamansi", "lemon", "lime" |
butter |
"butter", "margarine" |
bok_choy |
"bok choy", "bokchoy" |
black_soda |
"coke", "cola" |
beef_insides |
"lard", "cow", "beef heart", "beef kidney", "beef large instestine", "beef liver", "beef neck bone", "beef small intestine", "bile", "tripe", "tongue", "tripe", "lengua" |
beef_stock |
"beef cube", "beef bouillon", "bulalo", "beef broth", "beef stock" |
tomato_liquid |
"ketchup", "tomato sauce", "tomato paste", "spaghetti sauce" |
pepper |
"white pepper", "black pepper", "crushed pepper", "peppercorn" |
chili |
"chili", "pepper flakes", "serrano pepper", "sili", "jalapeno", "ghost pepper", "green pepper" |
bagoong |
"alamang", "shrimp paste", "balaw" |
liquid_seasoning |
"liquid seasoning", "savorrich", "marinade" |
chickpea |
"chick pea", "garbanzos" |
chicken_insides |
"chicken gizzard", "chicken hear", "chicken liver" |
pineapple_juice |
"pineapple juice", "juice from the canned tidbits" |
pasta |
"spaghetti", "macaroni" |
rice |
"rice", "sinangag" |
pork |
"pork", "pig", "lechon" |
beef |
"beef", "steak", "oxtail", "ox tail", "sirloin", "bistek" |
banana |
"banana", "plantain" |
pea, leeks, cucumber, turmeric, and glutinous_rice.| pounds(lbs.) | 0.00220462 |
| pinch | 0.355625 |
| thumb | 9.5 |
| liter(L) | 0.236588 |
| quart(qt) | 0.25 |
| pint | 0.5 |
| ounce(oz.) | 8 |
| tablespoon(tbsp.) | 16 |
| teaspoon(tsp.) | 48 |
| milliliter(ml) | 236.588 |
| pack | 1 |
| can | 1 |
| bundle | 5 |
| dozen | 12 |
SV component 2 and SV component 5 were used later in the Descriptive Analysis where the clusters are interpreted. | Carbs | carbohydrates, sugar, fiber |
grams (g) |
| Proteins | protein |
grams (g) |
| Lipids | cholesterol, fat |
milligrams (mg) |
| Vitamins | vitamin A |
international units (IU) |
| Vitamins | vitamin C |
milligrams (mg) |
| Minerals | sodium, potassium, calcium, iron |
grams (g) |
| IU (Vitamin A; beta-carotene) | 0.0003 |
Since the number of features in the Nutritional Value Dataset is only 12 to start with, no dimensionality reduction was applied. Sensitivity analysis was performed to verify the influence of performing SVD or principal component analysis, but no significant changes in the clustering results were found. However, for any plotting requirements that made use of the Nutritional Value Dataset (e.g., plotting clustering results), SVD was performed.
Four clustering methods were explored, two representative-based and two hierarchical algorithms: k-Means, k-Medians, and hierarchical clustering using Ward's method and complete linkage. For k-Means and k-Medians, k values from 2 to 11 were explored, while for the hierarchical methods, sensitivity analysis on the dendrogram cut-off points was performed. Whenever the hierarchical clustering methods produced a big cluster that could still be further re-clustered, a separate dendrogram cut-off point for that cluster was used (Figures 2 and 3).
In choosing the best method, the researchers considered the following factors:
Out of the considered methods, Ward's method satisfied all three requirements. This study only contains the codes for Ward's clustering. For the complete exploration codes, please see UlamNut_Clustering (testing).ipynb.
# --- Nutritional Value Dataset: clean, express per serving, and rescale ---
# manual_fix_dtypes and drop_features are helpers defined elsewhere in this
# notebook; the listed columns are handed over as the float columns.
fixed_df = manual_fix_dtypes(
    df_nut,
    float_cols=[
        'Calories', 'Carbohydrates', 'Protein', 'Fat',
        'Cholesterol', 'Sodium', 'Potassium', 'Fiber',
        'Sugar', 'A', 'C', 'Calcium', 'Iron', 'Serving',
    ],
)

# Keep only dishes with a positive serving count and a positive calorie value.
fixed_df = fixed_df[(fixed_df['Serving'] > 0) & (fixed_df['Calories'] > 0)]

# Divide every column after the first one by the serving count, row by row.
per_serving = fixed_df.iloc[:, 1:].div(fixed_df['Serving'], axis=0)
fixed_df.iloc[:, 1:] = per_serving

dropped_df_orig = drop_features(
    fixed_df,
    irrelevant_cols=['Serving', 'Calories'],
    skip_cols=None,
    p=0,
)
dropped_df = dropped_df_orig.drop(columns='dish_name')

# Min-max scale the remaining nutrient features into the matrix X.
scaler = MinMaxScaler()
X = scaler.fit_transform(np.array(dropped_df, dtype=float))
# --- Ingredient Dataset: drop features, rescale, and decompose with SVD ---
# fix_dtypes, drop_features, truncated_svd and project_svd are helpers defined
# elsewhere in this notebook.
# NOTE(review): fixed_df_ing is assigned here but never used below;
# drop_features receives df_ing instead. Unless fix_dtypes modifies df_ing in
# place, the dtype fix is discarded — confirm which frame was intended.
fixed_df_ing = fix_dtypes(df_ing)
dropped_df_orig_ing = drop_features(
df_ing, irrelevant_cols=None, skip_cols=None, p=0
)
dropped_df_ing = dropped_df_orig_ing.drop('dish_name', axis=1)
X_ing = np.array(dropped_df_ing, dtype=float)
# Min-max scale the ingredient features (this rebinds the global `scaler`).
scaler = MinMaxScaler()
X_ing = scaler.fit_transform(X_ing)
# Decompose the scaled matrix; 60 is presumably the number of singular
# vectors kept for the projection — TODO confirm against project_svd.
q, s, p, nssd = truncated_svd(X_ing)
X_new_ing = project_svd(q, s, 60)
# Table built from `p`, indexed by ingredient name, with columns renamed to
# SV1, SV2, ... and truncated to the first 60 columns.
feature_names = dropped_df_ing.columns
weights_df_ing = pd.DataFrame(p, index=feature_names)
weights_df_ing.columns = ['SV'+str(c+1) for c in weights_df_ing.columns]
weights_df_ing = weights_df_ing.iloc[:, :60]
# Emit a CSS rule for the notebook's `.output_png` elements (table-cell
# display, centred text, middle vertical alignment) via IPython's HTML
# display class, imported in this file under the alias `Center`.
Center(""" <style>
.output_png {
display: table-cell;
text-align: center;
vertical-align: middle;
}
</style> """)
# First clustering pass on the scaled nutrient matrix using the 'ward' method.
# final_clustering is defined elsewhere in this notebook; the positional
# arguments 2.5, 5 and False are presumably the dendrogram cut-off, a
# truncation level and the break_biggest_cluster flag — TODO confirm against
# its signature.
Z = final_clustering(
X,
dropped_df_orig.drop(columns='dish_name'),
'ward',
2.5,
5,
False,
link_colors=['#DA4D2E', '#8D9F38']
)
# Turn the result into flat cluster labels by cutting at distance 2.5
# (Z is presumably a linkage matrix, since it is passed to fcluster).
y_pred = fcluster(Z, t=2.5, criterion='distance')
# Attach each dish's cluster label as a new `cluster_no` column.
X_cluster = dropped_df_orig.assign(cluster_no=y_pred)
# Second clustering pass: re-cluster the biggest cluster from the first pass.
# The stray `2.5)` text that was fused onto the first statement (residue of a
# figure caption) has been removed; it made the statement a syntax error.
# final_clustering is defined elsewhere in this notebook; with
# break_biggest_cluster=True it presumably returns the linkage of the biggest
# cluster only — TODO confirm.
Z_1 = final_clustering(
    X,
    dropped_df_orig.drop(columns='dish_name'),
    'ward',
    2.5,
    5,
    break_biggest_cluster=True,
    link_colors=['#722502', '#CC9500', '#1F77B4', '#1F77B4'],
)
# Flat labels for the sub-clustering, cut at distance 4000.
y_pred_1 = fcluster(Z_1, t=4000, criterion='distance')

# Label of the most populated cluster from the first pass.
get_big_cluster = X_cluster.groupby(['cluster_no']).size().idxmax()

# Rows of that cluster, without their first-pass label.
X_cluster_1 = (
    X_cluster[X_cluster['cluster_no'] == get_big_cluster]
    .drop(columns='cluster_no')
)

# Offset the sub-cluster labels by 3, presumably so they stay distinct from
# the first-pass labels, then write the rows back into the full table.
X_cluster_1['cluster_no'] = y_pred_1 + 3
X_cluster.loc[X_cluster_1.index] = X_cluster_1

# Shift every label down by one.
X_cluster['cluster_no'] = X_cluster['cluster_no'] - 1
The group interpreted the clusters by looking at the nutritional content of the dishes in their respective groups concurrently with the ingredients that make them up. This way, we were able to form insights not only on the healthiness of the dish but also on what ingredients are dominant in terms of their nutritional value.
# Ingredient-space coordinates of each dish: wrap the projected matrix in a
# frame indexed by dish name, then move the dish name into a regular column.
svd_ing = pd.DataFrame(
    X_new_ing, index=dropped_df_orig_ing['dish_name']
).reset_index()

# Rename the positional component columns 0, 1, 2, ... to SV1, SV2, SV3, ...
svd_ing.columns = [
    'dish_name',
    *(f'SV{int(c) + 1}' for c in svd_ing.columns[1:]),
]

# Right-join the first five components onto the clustered nutrient table so
# that every row of X_cluster is kept.
plotly_df = svd_ing[
    ['dish_name', 'SV1', 'SV2', 'SV3', 'SV4', 'SV5']
].merge(X_cluster, on='dish_name', how='right')
def custom_names(x):
    """Return the descriptive cluster name for a cluster number.

    Parameters
    ----------
    x : int
        Cluster number; the values 1 to 4 are recognised.

    Returns
    -------
    str or None
        Name of the cluster, or None when `x` is not one of 1 to 4.
    """
    names_by_cluster = {
        1: 'Fatty and Salty Filipino Meal',  # high cholesterol, high sodium
        2: 'Super Filipino Meal',  # high protein, less fat, high vitamins
        3: 'Typical Filipino Meal',  # bland, low vitamins and minerals
        4: 'Upgraded Filipino Meal',  # bland, high vitamins and minerals
    }
    # Unknown cluster numbers yield None, as the original if/elif chain did.
    return names_by_cluster.get(x)
# Vectorise the name lookup and store each dish's cluster name in a new
# `cluster_names` column derived from `cluster_no`.
func = np.vectorize(custom_names)
plotly_df['cluster_names'] = func(plotly_df['cluster_no'])
def custom_color(x):
    """Return the plot colour assigned to a cluster number.

    Parameters
    ----------
    x : int
        Cluster number; the values 1 to 4 have a dedicated colour.

    Returns
    -------
    str
        Hex colour for cluster numbers 1 to 4, otherwise a translucent
        fallback given as an ``rgba(...)`` string.
    """
    colors_by_cluster = {
        1: '#DA4D2E',
        2: '#8D9F38',
        3: '#722502',
        4: '#EFC564',
    }
    # Any other value gets the near-transparent fallback colour.
    return colors_by_cluster.get(x, 'rgba(249, 248, 252, 0.4)')
# Vectorise the colour lookup (rebinding `func`) and store each dish's colour
# in a new `plotly_colors` column derived from `cluster_no`.
func = np.vectorize(custom_color)
plotly_df['plotly_colors'] = func(plotly_df['cluster_no'])
# Median value of every numeric column per named cluster, ordered by the
# median carbohydrate content.
# numeric_only=True skips the text columns (dish_name, plotly_colors):
# pandas 2.x raises a TypeError on them instead of silently dropping them.
bar_df = (
    plotly_df
    .groupby('cluster_names')
    .median(numeric_only=True)
    .sort_values('Carbohydrates')
)

# Keep only the nutrient columns: remove the SVD coordinates and the label.
bar_df = (
    bar_df.drop(['SV1', 'SV2', 'SV3', 'SV4', 'SV5', 'cluster_no'], axis=1)
)

# Positional rename to display names; this expects exactly these twelve
# nutrient columns in this order.
bar_df.columns = (
    ['Carbohydrates', 'Protein', 'Fat', 'Cholesterol', 'Sodium',
     'Potassium', 'Fiber', 'Sugar', 'Vitamin A', 'Vitamin C',
     'Calcium', 'Iron']
)
This study ended up with a roster of Filipino dishes clustered into four clusters. From the groups formed, the differences in nutritional content were evident: There was a clear separation between healthy and unhealthy dishes. With further examination, we observe that some key ingredients strongly affect the nutritional value of a dish.
# Draw the median nutrient chart for the 'Typical Filipino Meal' row of bar_df
# (plot_nutrients is defined earlier in this file).
plot_nutrients(bar_df, 'Typical Filipino Meal')
This cluster contains the largest number of dishes. We would find the most common Filipino food served in households in this cluster, most notably the world-famous adobo. However, the food here contains the least amount of nutrients in comparison with the three other clusters. Since the typical Filipino food belongs in this cluster, Figure 7 represents the average nutritional value of Filipino Dishes. In other words, the majority of our dishes fall flat when it comes to nutritional value.
Since we have a wide variety of dishes here, it is difficult to generalize what they have in common. What is clear is that they have the strong taste and the bold flavor that is signature to the Filipino Cuisine. It is backed up by the observation that more than eighty percent of the dishes in this cluster have pepper as one of the ingredients. In a more relatable Filipino expression, these are dishes that will make us order an extra cup of rice. More than the nutritional value, Filipinos eat the dishes in this cluster because it satisfies the Pinoy palate. While there are no doubts that the crowd-favorites are here, the nutrition we get from them is dubious.
There was no single ingredient driving the numbers this low, but the apparent lack of leafy vegetables in the dishes in this cluster has a lot to do with the scarcity of nutritional value. The upside here would be that there is a lot of room for reinventions and innovations in the traditional recipes to make them healthier. But as it stands, the traditional recipes we most likely follow, give little nutrition. What the group suggests here is the integration of nutritious ingredients that do not have a strong taste. Bland ingredients could go well with the dishes that inherently have a strong taste. Some examples would be spinach, asparagus, potato, and egg. We also suggest using leaner meat when cooking. In addition to this, we can do away with the pre-made sauces and seasoning and use the more natural ones instead.
# Draw the median nutrient chart for the 'Upgraded Filipino Meal' row of bar_df
# (plot_nutrients is defined earlier in this file).
plot_nutrients(bar_df, 'Upgraded Filipino Meal')
This cluster is similar to the Typical Filipino Meal Cluster but with slightly better nutritional value. We can also see some crowd-favorites here like Menudo, Afritada, and Caldereta. Most dishes in the Upgraded Filipino Meal Cluster are tomato-based. There is an instant upgrade in terms of nutritional value because it has tomatoes. Some ingredients naturally go well with tomato-based sauces like potatoes, carrots, and various herbs, which explains the improvement in Vitamin A, Potassium, and Vitamin C.
There is still a lack of leafy vegetables in this cluster. We think this is because most healthy vegetables have a bitter taste, which does not usually work with a tomato-based sauce. It is why the vitamin C, calcium, and potassium contents of the food in this cluster trail the ones in the Super Filipino Meal cluster by a significant margin. In terms of meat quality, the type of meat is better here as it uses leaner cuts. We can see this in the uptick in the protein content and just a little uptick in fat content. This means that the dishes here do not necessarily rely on the fat content of the meat for the savory flavor. The taste of the sauce here is more dominant compared to the other ingredients.
In terms of innovation in the recipes, we observe that the recipes here are easily modifiable. We can take the classic Menudo, for example. A healthier variant of this dish can be made by using leaner pork cuts, putting more carrots, and adding more bell pepper; a healthier version without compromising the taste. Changing the meat in the dish is also something to be considered. We can have chicken, pork, and beef versions of Caldereta, Menudo, and Afritada for all we want.
# Draw the median nutrient chart for the 'Super Filipino Meal' row of bar_df
# (plot_nutrients is defined earlier in this file).
plot_nutrients(bar_df, 'Super Filipino Meal')
With the highest amount of vitamins A and C, Calcium, Iron, Fiber, and Potassium, it is apparent that the healthiest Filipino dishes are in this cluster. It is also high in protein, with less fat and cholesterol, because the meat in the recipes usually uses chicken. The carbohydrates are the highest in this cluster because of starchy vegetables or crops like potatoes and carrots. Even though high carbohydrates in our ulams are sometimes frowned upon because we will still pair these dishes with white rice (all the carbs we need!), the starchy vegetables come with a healthy amount of potassium and vitamins. It is a good enough upside to take in exchange for additional carbohydrates. The amount of iron is also high in this cluster because of ingredients like bagoong and liver spread. Recipes here use green beans and even fruits like pineapple, which causes a high amount of dietary fiber in the cluster.
As we delved deeper into studying the Super Filipino Meal cluster, we found what is probably the healthiest common Pinoy dish ingredient in Pechay or Bok Choy. This leafy vegetable has high amounts of calcium, Vitamin A, and Vitamin C. In addition to this, it goes well with soup or sauce-based dishes such as Nilaga or Kare-Kare. Given the health benefits of Pechay, it made sense that dishes that include this in the recipe are in the healthiest cluster.
We could argue that this cluster has not only the healthiest food but also the most cost-efficient; soup-based serves more people and vegetables are generally cheaper. However, the group identifies some relevant limitations. We observed that dishes in the second cluster usually require lots of ingredients, which may have contributed to why these are not served as often as we think they should. Another factor would be that dishes here can be intimidating for people who do not like vegetables. For example, Kare-Kare is not popular with kids because of the number of vegetables present in this dish. In a typical Filipino household, kids can dictate what food will be served in family meals, and this can be an issue especially when they like food that is relatively unhealthy.
# Draw the median nutrient chart for the 'Fatty and Salty Filipino Meal' row
# of bar_df (plot_nutrients is defined earlier in this file).
plot_nutrients(bar_df, 'Fatty and Salty Filipino Meal')
Dishes in this cluster are generally salty and have high cholesterol. The group attributes it to several factors, like the parts of the meat used in the dishes. For example, there are recipes here that use Pata (Pork Leg), a pork part known to be high in fat and cholesterol. Viral Pinoy posts about having high blood pressure or even heart attacks are usually associated with dishes like Crispy Pata. The high protein in this cluster is due to the high amount of meat in the dish relative to the other ingredients and not necessarily about the quality of the protein itself. It explains why the fat and cholesterol amounts are through the roof, too. High sodium, on the other hand, could be associated with the use of salt and liquid seasonings in most dishes. Some recipes also require processed goods like corned beef, which has a high sodium content.
In general, dishes in this cluster are unhealthy. There is an unusually high amount of cholesterol, fat, and sodium, which could cause health issues in the long run. Luckily, the usual Filipino food that we eat is not in this cluster. The food here is cooked and served occasionally. In terms of recommendations, the group did not come up with good substitutes or even twists to make the food here healthier. The unhealthiness of the dishes here is mainly because of the type of meat used and the seasonings that come with it. There is just no replacing Pata in Crispy Pata; it makes this Filipino food sinfully good. The same goes with most dishes here; this is not your typical recipe where you can change the meat content, but it essentially stays the same dish.
Not all is lost about including this in a diet. Even though the food in this cluster, in general, is served occasionally, they fit in some diet programs. The ketogenic diet is becoming more and more popular, and dishes in the first cluster are commonly included in this type of diet. Essentially, Keto requires a very high fat intake and ultra-low carbohydrate intake. Pata dishes are the best candidates here. As for how healthy it is, experts seem to have conflicting ideas and takes on Keto. On the one hand, some would argue that it is unhealthy and unsustainable. On the other hand, some would be content and happy with the results, which usually include a drastic fat loss.
Instead of projecting the clustering results on the nutrient components, projecting them on the ingredient components allows us to better understand the nutritional value associated with different ingredient combinations. That is, by looking at the nutritional value of the ingredients' resulting dishes, we can identify ingredient combinations that usually result in healthy dishes, which ultimately allows us to uncover healthier alternative ingredients for otherwise unhealthy dishes.
For readability and ease of understanding, we will only project the results on a two-dimensional ingredient space. With interpretability as the researchers' utmost priority, SV2 and SV5 of the Ingredient Dataset, which cumulatively explain 9.81% of the data's variation, were chosen. Figures 4 and 5 below show how the researchers assigned meanings to these components.
def custom_color(x):
    """Pick a colour from the sign of the value `x`.

    Negative values give '#722502' and positive values give '#DA4D2E';
    a value that is neither (such as exactly zero) gives None.
    """
    if x < 0:
        return '#722502'
    return '#DA4D2E' if x > 0 else None


# Vectorise the sign-to-colour rule once and hand it to the plotting helper
# (plot_svd_zoomed is defined elsewhere in this notebook).
fill_fn = np.vectorize(custom_color)
plot_svd_zoomed(
    weights_df_ing,
    zoom_on='dominant',
    num_comp=2,
    num_ing=12,
    fill_fn=fill_fn,
    manual_fill_values=['#8D9F38', '#DA4D2E'],
)
The dominant ingredients in SV2, from both the positive and the negative end of the spectrum, include a kind of meat and a variety of vegetables. Particularly, ingredients with heavy negative loadings include pork and flavoring plants and vegetables (garlic, pepper, bay leaf), while ingredients with heavy positive loadings include chicken and crop vegetables (potatoes and carrots). At first glance, one could say that the ingredients with positive loadings would generally result in healthier dishes; however, ingredients with negative loadings also have a fair share of vegetables such as kangkong and eggplant. This is a glimpse of how attempting to associate nutritional value by merely looking at the ingredients is not as straightforward as one would expect. These ingredients, or combinations of them, could either define the overall nutritional value of the dish or at times be a game-changer to its nutritional content. In the end, it is important to look at the resulting dishes from these combinations of ingredients to better understand the extent of the nutritional value they offer.
def custom_color(x):
    """Custom color for the plot at hand.

    Picks one of two hex colours from the sign of ``x``.

    Parameters
    ----------
    x : number
        A single numeric value.

    Returns
    -------
    str
        '#5E6A25' when ``x`` is strictly negative; '#8D9F38' for any value
        that is not strictly negative (including exactly 0).
    """
    if x < 0:
        return '#5E6A25'
    # No separate `x > 0` test: a value of exactly 0 used to fall through and
    # return None, which np.vectorize would coerce into the string 'None'
    # instead of a usable colour. Zero is grouped with the positive colour.
    return '#8D9F38'
# Wrap the scalar colour picker so it can be applied element-wise to an array.
fill_fn = np.vectorize(custom_color)

# 'close to zero' view; num_comp=5 presumably selects SV5 of the ingredient
# weights (the component discussed in the text) -- confirm against
# plot_svd_zoomed.
plot_svd_zoomed(
    weights_df_ing,
    zoom_on='close to zero',
    num_comp=5,
    num_ing=15,
    fill_fn=fill_fn,
    manual_fill_values=['#722502', '#722502'],
)
def custom_color(x):
    """Custom color for the plot at hand.

    Picks one of two hex colours from the sign of ``x``.

    Parameters
    ----------
    x : number
        A single numeric value.

    Returns
    -------
    str
        '#5E6A25' when ``x`` is strictly negative; '#8D9F38' for any value
        that is not strictly negative (including exactly 0).
    """
    if x < 0:
        return '#5E6A25'
    # No separate `x > 0` test: a value of exactly 0 used to fall through and
    # return None, which np.vectorize would coerce into the string 'None'
    # instead of a usable colour. Zero is grouped with the positive colour.
    return '#8D9F38'
# Wrap the scalar colour picker so it can be applied element-wise to an array.
fill_fn = np.vectorize(custom_color)

# Dominant-ingredient view; num_comp=5 presumably selects SV5 of the
# ingredient weights (the component discussed in the text) -- confirm
# against plot_svd_zoomed.
plot_svd_zoomed(
    weights_df_ing,
    zoom_on='dominant',
    num_comp=5,
    num_ing=12,
    fill_fn=fill_fn,
    manual_fill_values=['#F4903E', '#8D9F38'],
)
SV5 is a bit more complex than the previous component. This time, the whole range of the spectrum, not just its ends, was inspected before an appropriate interpretation was assigned. The first part of Figure 5 shows the dominant ingredients in SV5, from both the positive and the negative end of the spectrum, while the second half shows the ingredients that are close to 0 (i.e., possibly pointing away from SV5). When pieced together, this shows the flavor range of the Filipino dishes: from nutty flavors (peanut and annatto) to salty (soy sauce, hoisin sauce) or sweet (honey, pineapple), and to sour and zesty (sinigang mix, radish).
Using SV2, which represents the meat and vegetable range of Filipino dishes, and SV5, which represents the flavor range of Filipino dishes, the following two-dimensional space was created.
Each point in the scatter plot represents a Filipino dish. The different colors represent the different groupings or clusters of the dishes based on nutrient content. The horizontal position indicates the kinds of meat and vegetables in the dish (from pork and garlic/green vegetables combo to lean meat and crops combo), while the vertical position indicates the dominating flavor of the dish (ranging across nutty, sweet, salty, and sour flavors).
# One row per distinct (cluster name, colour) pair found in plotly_df.
dict_df = plotly_df[['cluster_names', 'plotly_colors']].drop_duplicates()

# Each row is emitted as a plain (cluster_name, colour) tuple, so dict()
# builds the name -> colour lookup directly.
cmap = dict(dict_df.itertuples(index=False, name=None))

# Draw the clusters, passing 0 for both reference values.
plotly_clusters(plotly_df, y_ref=0, x_ref=0, c_map=cmap)
It can be observed that almost all clusters are clumped together in the sweet and salty flavor range and are spread across the meat and vegetable range. This suggests that regardless of the combination of meat and vegetables found in a Filipino dish, as long as it falls in the sweet and salty flavor range, it will almost always fall under the Typical Filipino Meal, the Upgraded Filipino Meal, or the Fatty and Salty Filipino Meal. Interestingly, the farther the dish is placed towards the left side of the plot, that is, the more it makes use of the pork and garlic combination, the more likely it is to be Fatty and Salty. In fact, out of all the dishes that use lean meat, only Chicken Hamonado and Pinoy Fried Chicken Recipe were tagged as Fatty and Salty.
Another interesting observation is that there appears to be a linear trend for the meat and vegetable range and the flavor range among the Super Filipino Meal dishes. That is, dishes that make use of the pork and garlic combination, as long as they have a nutty flavor, will likely fall under Super Filipino Meal. Perhaps this is because nutty sauces are often accompanied by vitamin-enriched vegetables such as pechay or bokchoy. Interestingly, Sinigang dishes, which fall under the second quadrant (dishes that make use of pork but have a sour flavor), are spread out across nutrient clusters. This suggests that the current variants of Sinigang were born of experimentation on the flavor and texture (e.g., deep-frying the meat) while giving only little consideration to its nutrient content.
Finally, there are currently no dishes in the first quadrant (dishes that make use of lean meat and crops that are sour). While this could serve as room for innovation from a flavor perspective, this also opens up avenues for innovations from a nutrition standpoint. That is, since Super Filipino Meals follow a linear trend in this 2D ingredient space, the researchers wonder: had there been existing Filipino dishes in the first quadrant, would they naturally belong to Super Filipino Meals as well?
In the food business, the word fusion is used for a curious and intriguing combination of dishes and cooking techniques from different cuisines that work together. However, given how our cuisine has combined different foreign cooking styles and recipes with our local techniques and native ingredients, we think that it is a fusion in itself. The diversity, creativeness, and boldness of our food were evident in the number of the different recipes in the data set and the ingredients that make them up. This variety translated to a wide range of nutritional values in our food, too. As the group focused on nutrition-based clustering in this study, we were able to produce insights on the healthiness of the Filipino dishes, the common ingredients that give a dish a high nutritional value, and how a data-driven approach to studying our food could have both practical and business use cases.
We presented general insights on each of the clusters. Some of the notable observations we had would be that the most common Filipino dishes fall under the Typical Filipino Meal Cluster, which, unfortunately, has the lowest nutritional value among all the clusters. The crowd-favorites are focused on satisfying the Pinoy palate more than providing the necessary nutrition. As for why the dishes here are the most common other than the taste, it may be because the dishes here are simple to make in that they require the fewest ingredients. Notable in this cluster is the lack of leafy vegetables, which could be incorporated in the classic recipes because the food here has strong flavors. The healthy greens are usually bland with a touch of bitterness, which could be balanced out by the dominant flavors present in the dishes here.
Another interesting observation was that all but one variant of Adobo was in this cluster. The Adobo with Tokwa and Tausi dish was in the Fatty and Salty Filipino Meal Cluster because the recipe included Garlic Fried Rice Adobo. The latter required additional adobo sauce, which led to the dish having a higher-than-expected sodium content than a usual Adobo dish. There was no actual variation in the nutrients of the adobo dishes even if the meat was switched out or additional ingredients such as soft drinks were included in the recipe. This means that the traditional adobo dish is light when it comes to nutritional value. Sinigang variants, on the other hand, covered all the clusters. This shows that even though Sinigang sounds like a standard Filipino dish, mixing up its ingredients can drastically affect the nutritional value. The usage of different pork parts affects the protein, fat, and cholesterol contents, while the amount and variety of vegetables affect the remaining micronutrients.
The healthy Super Filipino Meal Cluster is composed of dishes with green leafy vegetables, most notably Pechay. There are considerably fewer dishes in this cluster compared to the Typical Filipino Meal Cluster, which means that there is not much variety just yet in this cluster. This said, there is room for creating variations among the healthy dishes, but it certainly requires vegetables to be included in the recipe. Making good-tasting dishes that require fewer ingredients can also help in convincing Filipinos to eat healthier. In general, we need to explore using vegetables in more recipes because it was clear that they are the primary drivers for nutrients such as vitamins and other minerals. The same can be said about the Upgraded Filipino Meal Cluster. Dishes here were mostly tomato-based. It includes a few vegetables and crops like carrots, potatoes, and herbs which causes an increase in nutritional value. The initial purpose of this study was to come up with practical applications such as identifying a cost-efficient combination of dishes to serve on occasions, creating a good menu with the right balance of contrasting flavors, and reinventing traditional recipes, which would all be useful for the typical Filipino family. However, as the team did analysis and clustering, we realized that some business use cases will inevitably come out of this study. This study could help food businesses, especially those branding themselves as healthy, create a menu that maximizes nutritional value and profit. The healthy dishes that belong to the Super Filipino Meal Cluster could headline their food choices. They can also reinvent the unhealthy crowd-favorites to a healthier version. Businesses could use both ingredient-based and nutrition-based clustering as their guide in their food research and development.
A good extension of this project would be expanding the dataset to include other dishes and other recipe sources. There are hundreds of dishes in the Philippines, each prepared uniquely depending on the place of origin. Some may be similar, but some can be drastically different (e.g., Luzon's Pochero vs. Mindanao Pochero). Having a larger data set that accounts for these similarities and differences would lead to better research on the nutrients and ingredients of Filipino Dishes. There could also be additional business use cases such as starting a food business with themes like using only one base dish (e.g., different versions of Kare-Kare) or even using just one base ingredient (e.g., Lapu-Lapu ala Su-Tu-Kil). Adobo Connection is one example of this kind of business, although they started having other dishes on their menu during the last few years. From a data-driven perspective, we can see that from the hundreds of recipes and thousands of ingredients, there is so much room for innovation in this space. With data science, there is infinite potential in the food industry.
yeast |
"yeast" |
wrapper |
"wrapper" |
worcestershire_sauce |
"worcestershire" |
winged_bean |
"winged bean" |
vinegar |
"vinegar" |
turmeric |
"tumeric" |
tomato |
"tomato" |
tofu |
"tofu" |
toasted_rice_powder |
"toasted rice powder" |
taro |
"taro" |
sweet_potato |
"sweet potato" |
sugar |
"sugar" |
star_anise |
"star anise" |
squash |
"kalabasa", "squash" |
soy_sauce |
"soy sauce" |
sinigang_mix |
"sinigang" |
shortening |
"shortening" |
sesame_oil |
"sesame oil" |
scallion |
"scallion" |
sayote |
"sayote" |
salted_egg |
"salted egg" |
safflower_oil |
"safflower oil" |
raisins |
"raisins" |
radish |
"radish" |
potato |
"potato" |
pork_insides |
"pig’s liver", "pig’s heart", "pig’s small intestine", "bung", "pig cheeks", "pig heart", "pig kidney", "pig stomach", "pork ears", "pork large intenstine", "pork liver", "small intestine" |
pork_fat |
"pork fat" |
pork_stock |
"pork broth", "pork stock", "pork cube" |
pork_blood |
"pork blood" |
pork_and_beans |
"pork and beans" |
pie_crust |
"pie" |
pickle |
"pickle", "relish" |
pepper_leaf |
"pepper leaves" |
pechay |
"pechay" |
peanut |
"peanut" |
patola |
"patola" |
parsley |
"parsley" |
paprika |
"paprika" |
papaya |
"papaya" |
oyster_sauce |
"oyster" |
onion |
"onion" |
olive_oil |
"olive oil" |
olive |
"olive" |
okra |
"okra" |
nutmeg |
"nutmeg" |
noodle |
"noodle", "pancit", "sotanghon", "misua", "miswa" |
mushroom |
"mushroom" |
munggo |
"mung" |
mirin |
"mirin" |
mayonnaise |
"mayonnaise" |
malunggay |
"malunggay" |
liver_spread |
"liver" |
lemongrass |
"lemongrass" |
leeks |
"leeks" |
lechon_sauce |
"lechon" |
kasubha |
"kasubha" |
kangkong |
"spinach", "kangkong" |
jicama |
"jicama" |
jackfruit |
"jackfruit" |
ice |
"ice" |
hotdog |
"hotdog" |
hot_sauce |
"hot" |
honey |
"honey" |
hoisin_sauce |
"hoisin" |
green_pea |
"green pea", "pigeon pea" |
green_bean |
"green beans", "sitaw", "snake beans", "string beans", "snap pea", "snow pea" |
glutinous_rice |
"glutinous rice" |
ginger |
"ginger" |
ginataang_gulay_mix |
"ginataang" |
garlic |
"garlic" |
flour |
"flour" |
eggplant |
"eggplant", "talong" |
egg |
"egg" |
curry_powder |
"curry" |
cucumber |
"cucumber" |
cream |
"cream" |
cooking_wine |
"wine" |
cooking_oil |
"cooking oil", "vegetable oil" |
coconut_water |
"coconut water" |
coconut_milk |
"coconut cream", "coconut milk" |
coconut_meat |
"coconut meat" |
clear_soda |
"7-up", "sprite", "clear softdrink" |
cinnamon |
"cinnamon" |
sausage |
"chinese sauage", "chorizo" |
chicken_stock |
"chicken broth", "chicken cube" |
chicharon |
"chicharon" |
cheese |
"cheese" |
celery |
"celery" |
carrot |
"carrot" |
canned_meat |
"potted meat", "luncheon meat" |
calamansi |
"calamansi", "lemon", "lime" |
cabbage |
"cabbage" |
butter |
"butter", "margarine" |
broccoli |
"broccoli" |
bread |
"bread" |
bok_choy |
"bok choy", "bokchoy" |
black_soda |
"coke", "cola" |
black_bean |
"black bean" |
beer |
"beer" |
beef_insides |
"lard", "cow", "beef heart", "beef kidney", "beef large instestine", "beef liver", "beef neck bone", "beef small intestine", "bile", "tripe", "tongue", "tripe", "lengua" |
beef_stock |
"beef cube", "beef bouillon", "bulalo", "beef broth", "beef stock" |
bay_leaf |
"bay" |
bamboo_shoots |
"bamboo shoots" |
baking_powder |
"baking powder" |
annatto |
"annatto" |
ampalaya |
"ampalaya" |
adobo_sauce |
"adobo" |
achiote |
"achiote" |
tomato_liquid |
"ketchup", "tomato sauce", "tomato paste", "spaghetti sauce" |
banana_flower |
"blossom" |
pepper |
"white pepper", "black pepper", "crushed pepper", "peppercorn" |
chili |
"chili", "pepper flakes", "serrano pepper", "sili", "jalapeno", "ghost pepper", "green pepper" |
bell_pepper |
"bell pepper" |
bagoong |
"alamang", "shrimp paste", "balaw" |
liquid_seasoning |
"liquid seasoning", "savorrich", "marinade" |
chickpea |
"chick pea", "garbanzos" |
chicken_insides |
"chicken gizzard", "chicken hear", "chicken liver" |
cornstarch |
"cornstarch" |
corned_beef |
"corned beef" |
fish_sauce |
"fish sauce" |
pineapple_juice |
"pineapple juice", "juice from the canned tidbits" |
shrimp_cube |
"shrimp cube" |
watermelon |
"watermelon" |
milk |
"milk" |
pea |
"pea" |
pasta |
"spaghetti", "macaroni" |
shrimp |
"shrimp" |
pineapple |
"pineapple" |
water |
"water" |
salt |
"salt" |
rice |
"rice", "sinangag" |
pork |
"pork", "pig", "lechon" |
corn |
"corn" |
chicken |
"chicken" |
beef |
"beef", "steak", "oxtail", "ox tail", "sirloin", "bistek" |
banana |
"banana", "plantain" |
# Unique dish names per cluster_no group.
clusters = X_cluster.groupby('cluster_no')['dish_name'].unique()

# Headings are numbered by 1-based position in the grouped result, not by the
# cluster_no value itself.
for position, dishes in enumerate(clusters, start=1):
    # One print with a newline separator emits the same three lines:
    # the heading, the dish array, and a single-space spacer line.
    print(f'Cluster {position}', dishes, ' ', sep='\n')